# -*- coding: utf-8 -*-
import scrapy, random
from scrapy.http import Request, FormRequest
import re, urllib.request
from zhider.items import TopicItem

# Candidate User-Agent strings; the spider picks one of these for its headers.
ua = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/63.0.3239.84 Safari/537.36",
]


class TopicSpider(scrapy.Spider):
    """Spider that requests the Zhihu topics page and yields one item per topic.

    A single request is issued to ``https://www.zhihu.com/topics``; the
    response is scanned with a regular expression for ``<li data-id="...">``
    entries and each match is emitted as a ``TopicItem``.
    """

    name = 'Topic'
    allowed_domains = ['www.zhihu.com']
    # NOTE(review): start_requests() below builds its own request and does not
    # read start_urls.
    start_urls = ['http://www.zhihu.com/']
    # The User-Agent is picked once, when the class body executes, and is then
    # shared by every request that passes these headers.
    headers = {"User-Agent": random.choice(ua),
               "Referer": "https://www.zhihu.com"}

    # Matches `<li ... data-id="N" ...><a ...>TEXT</a></li>` and captures
    # (N, TEXT). Written as a raw string so `\d` reaches the regex engine
    # instead of being treated as an (invalid) string escape, and compiled once
    # here rather than on every savetopic() call.
    _topic_pattern = re.compile(
        r'<li[^>]*data-id="(\d+)"[^>]*><a[^>]*>([^<]*)</a></li>')

    def parse(self, response):
        """Do nothing; the request from start_requests() uses savetopic()."""
        pass

    def start_requests(self):
        """Return the initial request for the topics page.

        Returns:
            A one-element list holding a ``Request`` for the topics URL that
            carries ``self.headers``, sets the ``cookiejar`` meta key, and
            uses ``savetopic`` as its callback.
        """
        url = "https://www.zhihu.com/topics"
        return [Request(url, meta={"cookiejar": True}, headers=self.headers,
                        encoding="utf-8", callback=self.savetopic)]

    def savetopic(self, response):
        """Extract topics from the response body and yield them as items.

        Args:
            response: The response whose ``body`` holds the page bytes.

        Yields:
            ``TopicItem`` objects with ``id`` (the captured ``data-id`` digits,
            as a string), ``title`` (the captured anchor text) and ``pid``
            fixed to ``0``.
        """
        # Undecodable bytes are dropped rather than raising.
        html = response.body.decode("utf-8", "ignore")
        topics = self._topic_pattern.findall(html)
        print("crawl huatiguangchang ")
        for topic_id, title in topics:
            item = TopicItem()
            item["id"] = topic_id
            item["title"] = title
            item["pid"] = 0
            yield item

